Windows and entropies

Words per book


In [1]:
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
import statsmodels

In [2]:
# Load the stored book collection from the on-disk shelf. The shelf is closed
# explicitly (even if the key lookup fails) instead of relying on `del` and
# garbage collection to release the underlying database file.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    myShelf.close()
# The handle is not needed past this cell; drop the name as before.
del myShelf

In [3]:
# Build the working sample from the full collection. Going by the method
# names: keep authors with at least 10 books, sample 5 of them, then split at
# 10 books per author. Only the first part of the split is kept.
# NOTE(review): sample_authors looks random and no seed is set in this
# notebook -- confirm whether results are reproducible across runs.
someBooks, _ = (
    aBookCollection
    .exclude_authors_with_less_than(10)
    .sample_authors(5)
    .split_at_number_per_author(10)
)
aPossibleFeatureAnalyzer = bc.PossibleFeatureAnalyzer.from_book_collection(
    someBooks
)

In [4]:
# Frequencies over the whole sample, as a DataFrame (its `Value` column is
# what the QQ plot further down reads).
freqDf = (
    aPossibleFeatureAnalyzer
    .frequencies()
    .dataframe_total()
)

In [5]:
# Kernel density estimate of the log10-transformed frequencies.
(
    freqDf
    .apply(numpy.log10)
    .plot(kind='kde')
)


Out[5]:
<matplotlib.axes.AxesSubplot at 0x7f8d53fba2d0>

In [6]:
import statsmodels.graphics.gofplots as gp
import scipy.stats

# QQ plot of the log10 frequencies against a standard uniform distribution.
# The returned figure is bound to `_` so the cell shows only the plot.
_ = gp.qqplot(
    freqDf.Value.apply(numpy.log10),
    scipy.stats.distributions.uniform(),
)



In [7]:
# Entropies over the whole sample, followed by their kernel density estimate.
entrDf = (
    aPossibleFeatureAnalyzer
    .entropies()
    .dataframe_total()
)
entrDf.plot(kind='kde')


Out[7]:
<matplotlib.axes.AxesSubplot at 0x7f8d536e52d0>

In [8]:
# QQ plot of the raw entropy values against a standard uniform distribution.
_ = gp.qqplot(
    entrDf.Value,
    scipy.stats.distributions.uniform(),
)



In [9]:
# Prune features by frequency quantile. The bounds (0, 1) are the ones in
# use here; an earlier experiment used (0.35, 1) instead.
# NOTE(review): (0, 1) presumably keeps the full quantile range, i.e. prunes
# nothing -- confirm against prune_frequencies_quantiles.
blah = aPossibleFeatureAnalyzer.prune_frequencies_quantiles(
    0,
    1,
)

In [10]:
# Frequencies of the pruned analyzer, over the whole sample.
freqDf2 = (
    blah
    .frequencies()
    .dataframe_total()
)

In [11]:
# Kernel density estimate of the log10 frequencies after pruning.
(
    freqDf2
    .apply(numpy.log10)
    .plot(kind='kde')
)


Out[11]:
<matplotlib.axes.AxesSubplot at 0x7f8d50e8ba50>

In [12]:
# QQ plot of the pruned log10 frequencies against a standard uniform
# distribution.
_ = gp.qqplot(
    freqDf2.Value.apply(numpy.log10),
    scipy.stats.distributions.uniform(),
)



In [13]:
# Entropies of the pruned analyzer, over the whole sample.
entrDf2 = (
    blah
    .entropies()
    .dataframe_total()
)

In [14]:
# Kernel density estimate of the entropies after pruning.
entrDf2.plot(
    kind='kde',
)


Out[14]:
<matplotlib.axes.AxesSubplot at 0x7f8d539f8f50>

In [15]:
# Show the ten rows with the smallest entropy `Value`.
# `DataFrame.sort` was deprecated in pandas 0.17 and removed in 0.20;
# `sort_values` is the supported equivalent with the same ascending default.
entrDf2.sort_values('Value').head(10)


Out[15]:
Value
ides -0
sabres -0
cables -0
ites -0
contralto -0
hiking -0
entertainments -0
trifler -0
predilection -0
pacers -0

In [16]:
# Make subsequent figures 10x6 inches. `plt.figsize` is not a
# matplotlib.pyplot function -- it only exists when IPython's pylab mode has
# patched it in -- so a fresh kernel raises AttributeError. Setting the
# rcParam directly has the same effect and works everywhere; like the
# original call, it stays in force for later cells too.
plt.rcParams['figure.figsize'] = (10, 6)

# One histogram per column of the per-author entropy frame.
entrPnl2 = blah.entropies().dataframe_authors()
entrPnl2.hist()


Out[16]:
array([[<matplotlib.axes.AxesSubplot object at 0x7f8d50d10210>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d5373af50>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d53440190>],
       [<matplotlib.axes.AxesSubplot object at 0x7f8d536ab650>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d505f3ed0>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d53561290>]], dtype=object)

In [17]:
# One histogram per column of the per-author frequency frame; `log=True` is
# passed through to the histogram call to use a logarithmic count axis.
freqPnl2 = (
    blah
    .frequencies()
    .dataframe_authors()
)
freqPnl2.hist(
    log=True,
)


Out[17]:
array([[<matplotlib.axes.AxesSubplot object at 0x7f8d535583d0>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d534a3310>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d53464f10>],
       [<matplotlib.axes.AxesSubplot object at 0x7f8d534990d0>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d50896950>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d5083bcd0>]], dtype=object)

In [18]:
# For every column of the per-author entropy frame (presumably one column per
# author -- confirm against dataframe_authors), sample the empirical quantile
# function at 50 evenly spaced levels (0.00, 0.02, ..., 0.98). Each resulting
# curve becomes one row of `mydata`.
mydata = []
df = blah.entropies().dataframe_authors()
for col in df:
    # Drop missing values once per column rather than once per quantile level.
    colValues = df[col].dropna()
    # Divide by 50.0, not 50: under Python 2 integer division `v/50` is 0 for
    # every v in range(50), which would collapse each curve to its minimum.
    arr = [colValues.quantile(v / 50.0) for v in range(50)]
    mydata.append(arr)

In [19]:
import statsmodels.api as sm
# Functional boxplot of the quantile curves collected in `mydata`. The call's
# return value is left as the cell's last expression so it is displayed.
sm.graphics.fboxplot(
    mydata,
)


Out[19]:
(<matplotlib.figure.Figure at 0x7f8d5053a610>,
 array([ 0.686,  0.712,  0.808,  0.802,  0.654]),
 array([2, 3, 1, 0, 4]),
 array([0, 1, 4]))

In [ ]: